In this notebook, I give a brief introduction to denoising diffusion models as well as their continuous-time limit, score-based generative models. Note that many people use these names (diffusion models and score-based generative models) interchangeably for both formulations.
The main idea is to first noise (1) and then denoise (2) the data:
Since sampling white noise is easy on a computer, if we can learn the transformation (2), we can sample from $\mu_\text{data}$ by first sampling white noise and then applying the learned transformation (2).
import jax.numpy as jnp
import jax
import matplotlib.pyplot as plt
from jax.lax import scan
from jax import grad, jit, vmap
import jax.random as random
from functools import partial
import numpy as onp
rng = random.PRNGKey(2022)
import flax.linen as nn
import optax
import scipy
import seaborn as sns
sns.set_style("darkgrid")
cm = sns.color_palette("mako_r", as_cmap=True)
def heatmap_data(positions, area_min=-2, area_max=2):
def small_kernel(z, area_min, area_max):
a = jnp.linspace(area_min, area_max, 512)
x, y = jnp.meshgrid(a, a)
dist = (x - z[0])**2 + (y - z[1])**2
hm = jnp.exp(-350*dist)
return hm
#again we try to jit most of the code, but use the helper functions
#since we cannot jit all of it because of the plt functions
@jit
def produce_heatmap(positions, area_min, area_max):
return jnp.sum(vmap(small_kernel, in_axes=(0, None, None))(positions, area_min, area_max), axis=0)
    hm = produce_heatmap(positions, area_min, area_max)
return hm
def plot_heatmap(positions, area_min=-2, area_max=2):
"""
positions: locations of all particles in R^2, array (J, 2)
area_min: lowest x and y coordinate
area_max: highest x and y coordinate
will plot a heatmap of all particles in the area [area_min, area_max] x [area_min, area_max]
"""
hm = heatmap_data(positions, area_min, area_max)
extent = [area_min, area_max, area_max, area_min]
im = plt.imshow(hm, cmap=cm, interpolation='nearest', extent=extent)
ax = plt.gca()
ax.invert_yaxis()
return im
To visualize our algorithm output, we will now create a toy dataset consisting of two disconnected spheres. We assume that we have 10 samples from each sphere, i.e. 20 training samples overall. These are in $R^2$. We then want to train the algorithm to generate more samples from the underlying distribution.
Thinking of image datasets, each of the $20$ points would now represent one image and the two spheres would be the abstract distribution of all images.
def sample_sphere(J):
alphas = jnp.linspace(0, 2*jnp.pi * (1 - 1/J), J)
xs = jnp.cos(alphas)
ys = jnp.sin(alphas)
mf = jnp.stack([xs, ys], axis=1)
return mf
J = 20
sphere1 = sample_sphere(J//2) * 0.5 + 0.7
sphere2 = sample_sphere(J//2) * 0.5 - 0.7
mf = jnp.concatenate((sphere1, sphere2))
plt.scatter(mf[:, 0], mf[:, 1]);
The most intuitive idea to achieve our goal is the following. We just take our training dataset, add lots of noise to it (this would be transformation (1)), and then train a neural network to predict the original datapoint again (transformation (2)). We will now try to do exactly that.
This is one of the most straightforward neural networks there is. We just apply linear functions to the input and interleave these linear functions with nonlinear activation functions. The specific choice of the nonlinear activation function is up to the user; we use relu, which is one of the most popular ones.
class FullyConnected(nn.Module):
@nn.compact
def __call__(self, x):
in_size = x.shape[1]
n_hidden = 256
act = nn.relu
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(in_size)(x)
return x
#some dummy input data. Flax is able to infer all the dimensions of the weights
#if we supply it with the kind of input data it has to expect
x = jnp.zeros(20).reshape((10, 2))
#initialize the model weights
denoiser = FullyConnected()
rng, srng = random.split(rng)
params = denoiser.init(srng, x)
#Initialize the optimizer
optimizer = optax.adam(1e-3)
opt_state = optimizer.init(params)
The loss is as described above: we take some input data and add standard normal noise. However, ideally we want there to be nearly no signal left in the noisy version: the neural network should learn to predict the original data from nearly white noise. Therefore, we also multiply the data with a small scalar (0.01 in our case). Also, we predict the noise used to corrupt the data instead of the data itself. Since the data can be recovered by taking the noisy version and subtracting the noise, this is equivalent. One can also directly predict the data, and some algorithms do so.
The update function then takes the gradient of the loss function and applies it to the parameters of the neural network. Except for some improvements made by the adam optimizer (which we use), this is very close to plain gradient descent.
def loss_fn(params, model, rng, data):
#Noise the Data
rng, step_rng = random.split(rng)
noise = random.normal(step_rng, data.shape)
noised_data = 0.01 * data + noise
    #Predict the noise from the noised data
output = model.apply(params, noised_data)
loss = jnp.mean((noise - output)**2)
return loss
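For comparison, here is a minimal sketch of the variant mentioned above that predicts the data directly instead of the noise. It is not used anywhere below, and the name loss_fn_predict_data is just for illustration:
def loss_fn_predict_data(params, model, rng, data):
    #Noise the data exactly as in loss_fn above
    rng, step_rng = random.split(rng)
    noise = random.normal(step_rng, data.shape)
    noised_data = 0.01 * data + noise
    #Predict the clean data directly instead of the noise
    output = model.apply(params, noised_data)
    loss = jnp.mean((data - output)**2)
    return loss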
@partial(jit, static_argnums=[4])
def update_step(params, rng, batch, opt_state, model):
val, grads = jax.value_and_grad(loss_fn)(params, model, rng, batch)
updates, opt_state = optimizer.update(grads, opt_state)
params = optax.apply_updates(params, updates)
return val, params, opt_state
Training is now basically running gradient descent with the above loss function.
N_epochs = 60_000
train_size = mf.shape[0]
losses = []
for k in range(N_epochs):
rng, step_rng = random.split(rng)
loss, params, opt_state = update_step(params, step_rng, mf, opt_state, denoiser)
losses.append(loss)
if (k+1) % 5_000 == 0:
mean_loss = onp.mean(onp.array(losses))
losses = []
print("Epoch %d,\t Loss %f " % (k+1, mean_loss))
Epoch 5000,  Loss 0.001488
Epoch 10000,  Loss 0.000397
Epoch 15000,  Loss 0.000264
Epoch 20000,  Loss 0.000229
Epoch 25000,  Loss 0.000192
Epoch 30000,  Loss 0.000178
Epoch 35000,  Loss 0.000158
Epoch 40000,  Loss 0.000162
Epoch 45000,  Loss 0.000140
Epoch 50000,  Loss 0.000145
Epoch 55000,  Loss 0.000141
Epoch 60000,  Loss 0.000133
We now sample standard normal noise and simply apply our learned denoiser to that standard normal distribution. Afterwards, we plot a heatmap. Ideally, it should look similar to the training data above and generalize to the two spheres.
def sample(rng, N_samples, model, params):
    rng, step_rng = random.split(rng)
    noised_data = random.normal(step_rng, (N_samples, 2))
    predicted_noise = model.apply(params, noised_data)
    #invert the noising 0.01*data + noise: subtract the predicted noise and divide by 0.01
    data = 100*(noised_data - predicted_noise)
    return data
N_samples = 1000
rng, srng = random.split(rng)
samples = sample(srng, N_samples, denoiser, params)
plot_heatmap(samples)
The samples above depict a heatmap of 1000 samples that our neural network has generated. These look nothing like the two spheres from above - therefore we should try something else.
It seems that it is too hard for the neural network to learn to predict the true data from a version with very little signal and a lot of noise. But maybe we can make the task simpler. Instead of immediately going from noise to the data distribution, we can add intermediate steps. Then, the neural network only needs to be good enough to predict a slightly less noisy version from the current one.
We call our original data $x_0$ and create more and more noisy versions $x_1, x_2, \ldots, x_T$ from it by iterating $$ x_t = \sqrt{1 - \beta_t}~x_{t-1} + \sqrt{\beta_t}~\xi_t, \qquad\qquad \text{with} \quad \xi_t \sim \mathcal{N}(0, I), $$ for some small noising speed $\beta_t$.
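As a small illustration (we will not actually need this loop, since a closed-form expression follows below), the forward noising could be simulated step by step like this; the function name forward_noising is just a placeholder, and betas is assumed to be an array of noising speeds like the schedule we define later:
def forward_noising(rng, x0, betas):
    #iterate x_t = sqrt(1 - beta_t) * x_{t-1} + sqrt(beta_t) * xi_t
    x = x0
    for beta in betas:
        rng, step_rng = random.split(rng)
        xi = random.normal(step_rng, x.shape)
        x = jnp.sqrt(1 - beta) * x + jnp.sqrt(beta) * xi
    return x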
The most straightforward idea to implement the above would be to learn an estimator $\hat{\xi}_t \approx \xi_t$ of the noise and then go back in time via $$ x_{t-1} \approx \frac{1}{\sqrt{1 - \beta_t}}(x_t - \sqrt{\beta_t}~\hat{\xi}_t). $$ Learning the estimator is tractable: we can sample $x_0$ from the dataset and use the forward-noising scheme to get samples of $x_{t-1}$ and $x_t$. Then we would learn to predict $x_{t-1}$ from $x_t$.
However, this does not work in practice, at least not for me! I reckon that this is due to the fact that $x_{t-1}$ and $x_t$ are very similar to each other, and it is a hard task for the neural network to learn what exactly it should be doing. However, if someone has some better input on this, please feel free to get in touch!
Since the formula for deriving $x_t$ from $x_{t-1}$ is so simple, we can apply it recursively and get a nice expression for $x_t$ in terms of $x_0$. It turns out that we can write $x_t$ as $$ x_t = \sqrt{\alpha_t}~x_0 + \sqrt{1 - \alpha_t}~Z_t, \qquad\qquad \text{with} \quad Z_t \sim \mathcal{N}(0, I) $$ and $$ \alpha_t = \prod_{j=1}^t (1 - \beta_j). $$ The $\alpha_t$ can be interpreted as the amount of signal left in $x_t$.
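To see where the product comes from, insert the recursion into itself once: $$ x_t = \sqrt{1-\beta_t}\left(\sqrt{1-\beta_{t-1}}~x_{t-2} + \sqrt{\beta_{t-1}}~\xi_{t-1}\right) + \sqrt{\beta_t}~\xi_t = \sqrt{(1-\beta_t)(1-\beta_{t-1})}~x_{t-2} + \sqrt{1 - (1-\beta_t)(1-\beta_{t-1})}~\xi, $$ where the two independent Gaussian terms were merged into a single $\xi \sim \mathcal{N}(0, I)$, since their variances simply add up: $(1-\beta_t)\beta_{t-1} + \beta_t = 1 - (1-\beta_t)(1-\beta_{t-1})$. Iterating this all the way down to $x_0$ gives exactly the product $\alpha_t = \prod_{j=1}^t (1-\beta_j)$.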
Now, we can try to again predict $Z_t$ from $x_t$.
Note that we are now back in our original setting: we want to learn the true data from noisy versions of it. However, we will not do this only once. At the largest level $T$, the neural network has nearly the same task as above: predict $x_0$ from nearly white noise -- and we have seen that it will not do very well on that task. Therefore, we do not use this bad first guess directly as our estimate of $x_0$; instead, we only use it to get a prediction of $x_{T-1}$ (a slightly less noisy version of $x_0$) by adding noise again, i.e. to move from $x_T$ to $x_{T-1}$. Then, we use another denoiser trained at level $T-1$ (in practice this will be the same neural network, but with a time input to differentiate between the different noise levels) to get another guess of $x_0$. This guess is then used to move to the noise level of $x_{T-2}$, and so on.
The full algorithm then looks like this: we start from white noise $x_T \sim \mathcal{N}(0, I)$ and, at each level $t$, predict the noise $\hat{Z}_t$ with the neural network, form the guess $\hat{x}_0 = (x_t - \sqrt{1-\alpha_t}~\hat{Z}_t)/\sqrt{\alpha_t}$, and then re-noise this guess to the next level, $x_{t-1} = \sqrt{\alpha_{t-1}}~\hat{x}_0 + \sqrt{1-\alpha_{t-1}}~\xi$ with fresh noise $\xi \sim \mathcal{N}(0, I)$. We iterate this until we reach $t=0$, where we output the final guess $\hat{x}_0$ instead of noising again.
The choice of $\beta_t$ is up to the person implementing the algorithm. One should make sure, however, that the final $\alpha_T$ is small enough, so that $x_T$ is close to white noise (because it will then be nearly equal to $Z_T$). It is common to choose $\beta_t$ small for small $t$ and larger later on. Intuitively, that is because for large $t$ the denoising task is easier, since both $x_{t-1}$ and $x_t$ are close to Gaussian. Therefore, one can take larger steps.
The following is one of the more popular choices for $\beta_t$ (and therefore $\alpha_t$). It linearly interpolates the noising rate from $\beta_\text{min} = 0.1$ to $\beta_\text{max} = 20$ and scales it by the step size $1/N$.
beta_min = 0.1
beta_max = 20
def get_alphas_betas(N):
# We interpolate between taking small noising steps first (of size beta_min/N)
# and taking larger steps at the end (of magnitude beta_max/N)
# one can use any kind of beta scheduling here, and finding the best one is an open research question
    # The one we take is inspired by the scheduling chosen in https://github.com/yang-song/score_sde and
# has proven itself in practice
betas = jnp.array([beta_min/N + i/(N*(N-1))*(beta_max-beta_min) for i in range(N)])
    # Note that N should be at least of size beta_max so that all betas stay below one (and 1 - beta stays positive)
alphas = jnp.cumprod(1 - betas)
return alphas, betas
alphas, betas = get_alphas_betas(100)
plt.plot(alphas, label="Amount Signal")
plt.plot(1 - alphas, label="Amount Noise")
plt.legend()
The loss function looks nearly as before, except that we now also sample a random noise level $\alpha$ with which to corrupt the data. The model also takes the signal ratio $\alpha$ as an input, so that it can learn different denoisers for different levels.
def loss_fn(params, model, rng, data, alphas):
rng, step_rng = random.split(rng)
alpha = random.choice(step_rng, alphas, (data.shape[0],1))
rng, step_rng = random.split(rng)
noise = random.normal(step_rng, data.shape)
noised_data = data * alpha**0.5 + noise * (1 - alpha)**0.5
output = model.apply(params, noised_data, alpha)
loss = jnp.mean((noise - output)**2)
return loss
@partial(jit, static_argnums=[4])
def update_step(params, rng, batch, opt_state, model, alphas):
val, grads = jax.value_and_grad(loss_fn)(params, model, rng, batch, alphas)
updates, opt_state = optimizer.update(grads, opt_state)
params = optax.apply_updates(params, updates)
return val, params, opt_state
We now supply the neural network with a signal-level input $t$, so that it can distinguish between the different noise scales. The name already suggests that we can also interpret this parameter as time.
The network is still a plain fully connected feedforward network. All we do in addition is transform the time into so-called Fourier features. These do not actually change the task, since they do not give the network new information (they are just transformations of $t$). Intuitively, this step emphasizes $t$ and makes it easier for the network to distinguish between different values of $t$, since the prediction/denoising task can be very different depending on the noise scale/time parameter $t$.
import flax.linen as nn
class FullyConnectedWithTime(nn.Module):
"""A simple model with multiple fully connected layers and some fourier features for the time variable."""
@nn.compact
def __call__(self, x, t):
in_size = x.shape[1]
n_hidden = 256
act = nn.relu
t = jnp.concatenate([t - 0.5, jnp.cos(2*jnp.pi*t), jnp.sin(2*jnp.pi*t), -jnp.cos(4*jnp.pi*t)],axis=1)
x = jnp.concatenate([x, t],axis=1)
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(n_hidden)(x)
x = act(x)
x = nn.Dense(in_size)(x)
return x
x = jnp.zeros(2*10).reshape((10, 2))
time = jnp.ones((10, 1))
#initialize the model weights
denoiser_multiple_steps = FullyConnectedWithTime()
rng, srng = random.split(rng)
params_multiple_steps = denoiser_multiple_steps.init(srng, x, time)
#Initialize the optimizer
optimizer = optax.adam(1e-3)
opt_state = optimizer.init(params_multiple_steps)
train_size = mf.shape[0]
losses = []
for k in range(N_epochs):
rng, step_rng = random.split(rng)
loss, params_multiple_steps, opt_state = update_step(params_multiple_steps, step_rng, mf, opt_state, denoiser_multiple_steps, alphas)
losses.append(loss)
if (k+1) % 5_000 == 0:
mean_loss = onp.mean(onp.array(losses))
print("Epoch %d,\t Loss %f " % (k+1, mean_loss))
losses = []
Epoch 5000,  Loss 0.196400
Epoch 10000,  Loss 0.181512
Epoch 15000,  Loss 0.176624
Epoch 20000,  Loss 0.171174
Epoch 25000,  Loss 0.170849
Epoch 30000,  Loss 0.169433
Epoch 35000,  Loss 0.170603
Epoch 40000,  Loss 0.167742
Epoch 45000,  Loss 0.169246
Epoch 50000,  Loss 0.170066
Epoch 55000,  Loss 0.167391
Epoch 60000,  Loss 0.167233
During sampling, we will now at each step make a new guess of $x_0$, the denoised_guess. Then, we noise that denoised_guess again, but with the signal level $\alpha_{t-1}$, to get an estimate of $x_{t-1}$.
In the last step, we do not noise the signal again, but instead output our final guess of $x_0$.
def sample_with_time(rng, N_samples, model, params, alphas, betas):
    rng, step_rng = random.split(rng)
    noised_data = random.normal(step_rng, (N_samples, 2))
    #walk backwards through the noise levels, from t = T-1 down to t = 0
    for i in range(len(betas)):
        t = len(betas) - 1 - i
        alpha = alphas[t] * jnp.ones((noised_data.shape[0], 1))
        noise_guess = model.apply(params, noised_data, alpha)
        rng, step_rng = random.split(rng)
        #our current guess of x_0, obtained by removing the predicted noise
        denoised_guess = 1/alpha**0.5 * (noised_data - noise_guess*(1-alpha)**0.5)
        if t > 0:
            #re-noise the guess of x_0 to the signal level alpha_{t-1}
            new_noise = random.normal(step_rng, noised_data.shape)
            alpha_tm1 = alphas[t-1]
            noised_data = denoised_guess * alpha_tm1**0.5 + (1 - alpha_tm1)**0.5 * new_noise
        else:
            #in the last step, output the final guess of x_0
            noised_data = denoised_guess
    return noised_data
rng, srng = random.split(rng)
samples = sample_with_time(srng, N_samples, denoiser_multiple_steps, params_multiple_steps, alphas, betas)
plot_heatmap(samples)
Above we again plot a heatmap of 1000 generated samples, and all of the samples are around the two spheres. Therefore, the neural network was able to learn the distribution. Ideally, however, we would like the spheres to be less "smeared out".
As discussed above, it did not work to directly learn to predict $x_{t-1}$ from $x_t$. Therefore, we learned to predict $x_0$, went all the way to (our guess of) $x_0$, and then back to $x_{t-1}$. However, it turns out that one can also use the guess of $x_0$ to go from $x_t$ to $x_{t-1}$ directly, without going to $x_0$ in between. We do this by still employing our guess of $x_0$ given $x_t$, but only moving in that direction a little bit.
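Concretely, given the noise estimate $\hat{Z}_t$ at level $t$, the update implemented in the sampler below is $$ x_{t-1} = \frac{1}{\sqrt{1-\beta_t}}\left(x_t - \frac{\beta_t}{\sqrt{1-\alpha_t}}~\hat{Z}_t\right) + \sqrt{\beta_t}~\xi, \qquad\qquad \xi \sim \mathcal{N}(0, I), $$ where the additional noise $\xi$ is dropped in the very last step.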
def sample_with_time(rng, N_samples, model, params, alphas, betas):
    rng, step_rng = random.split(rng)
    all_outputs = onp.zeros((len(betas)+1, N_samples, 2))
    noised_data = random.normal(step_rng, (N_samples, 2))
    all_outputs[0, :, :] = noised_data
    #walk backwards through the noise levels, from t = T-1 down to t = 0
    for i in range(len(betas)):
        t = len(betas) - 1 - i
        beta = betas[t]
        alpha = alphas[t] * jnp.ones((noised_data.shape[0], 1))
        noise_guess = model.apply(params, noised_data, alpha)
        rng, step_rng = random.split(rng)
        new_noise = random.normal(step_rng, noised_data.shape)
        #move a little bit into the direction of the denoised guess
        noised_data = 1/(1 - beta)**0.5 * (noised_data - beta/(1 - alpha)**0.5 * noise_guess)
        if t > 0:
            #add fresh noise, except in the very last step
            noised_data += beta**0.5 * new_noise
        all_outputs[i+1, :, :] = noised_data
    return noised_data, all_outputs
rng, srng = random.split(rng)
samples, all_outputs = sample_with_time(srng, N_samples, denoiser_multiple_steps, params_multiple_steps, alphas, betas)
im = plot_heatmap(samples)
This works extremely well, and is actually one of the diffusion model implementations used in practice! The way we go from $x_t$ to $x_{t-1}$ is the one introduced in [1] and [2] (see references at the bottom of the notebook), while the noising scheme is the one employed in [3]. The works [1], [2] and [3] are three of the most influential works on this topic.
Since these works were released, studying tweaks to the above algorithm to improve the performance even further has been a very active field of research, see for example [4] or [5]. Nevertheless, if one exchanges the neural network for a more sophisticated architecture (normally a U-Net), the above implementation can already be used to sample from image distributions!
We now animate how the distribution evolves through time. At the beginning, we see scattered particles coming from a normal distribution. As time evolves, they move around and all the particles converge to the two spheres. These final particles are then output as samples by the algorithm.
import matplotlib.animation as animation
fig = plt.figure( figsize=(8,8))
im = plot_heatmap(samples)
def animate(frame):
im.set_array(heatmap_data(all_outputs[frame, :, :]))
return [im]
anim = animation.FuncAnimation(fig, animate, frames=all_outputs.shape[0])
anim.save('samples.mp4', fps=10, extra_args=['-vcodec', 'libx264'])
from IPython.display import Video
Video("samples.mp4")
In the above, we are taking small steps to move from $x_t$ to a similar $x_{t-1}$. It turns out that if we let the step size go to zero, this converges to a stochastic differential equation (SDE) (see [3]), and one can formulate the whole algorithm as a discretization of an SDE!
Furthermore, in the continuous-time limit, many of the resulting objects are actually well known (the denoiser approximates gradients of log-densities, so-called scores, and implements a time reversal of the SDE). The two formulations of diffusion models are mostly equivalent, and one can use whichever they prefer. In the continuous-time limit one can also derive an ordinary differential equation for the denoiser, connecting diffusion models to normalizing flows. Furthermore, there are connections to optimal control and smoothing, to drop some more buzz words!
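For reference, and without derivation, in the continuous-time picture of [3] the forward noising becomes the SDE $$ \mathrm{d}x_t = -\tfrac{1}{2}\beta(t)\,x_t\,\mathrm{d}t + \sqrt{\beta(t)}\,\mathrm{d}W_t, $$ its time reversal (the generative process) is $$ \mathrm{d}x_t = \left[-\tfrac{1}{2}\beta(t)\,x_t - \beta(t)\,\nabla_x \log p_t(x_t)\right]\mathrm{d}t + \sqrt{\beta(t)}\,\mathrm{d}\bar{W}_t, $$ run backwards in time, and the associated probability-flow ODE is $$ \mathrm{d}x_t = \left[-\tfrac{1}{2}\beta(t)\,x_t - \tfrac{1}{2}\beta(t)\,\nabla_x \log p_t(x_t)\right]\mathrm{d}t. $$ The score $\nabla_x \log p_t$ is what our noise predictor approximates, up to scaling: $\nabla_x \log p_t(x_t) \approx -\hat{Z}_t/\sqrt{1-\alpha_t}$.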
If you are interested in an introduction to the continuous-time diffusion models, also called score-based generative models, there is another notebook at http://jakiw.com/sgm_intro which delves into that topic. It also studies generalization/overfitting properties of diffusion models.
Placement of the following works: the works [1]-[4] are some of the most influential works on diffusion models that have been published. The works [1] and [2] use the discrete-time formulation of diffusion models at which we arrive at the end (without going to $x_0$ in between); however, they derive it differently from how we did it here (through minimizing a KL divergence/ELBO). Going to $x_0$ in between has, for example, been explored in [6], where arbitrary corruptions of $x_0$ (not Gaussian noise, but for example blurring) were studied. In [4] it was empirically shown that diffusion models can beat GANs, which were the state-of-the-art generative models before.
[1] Deep Unsupervised Learning using Nonequilibrium Thermodynamics by Jascha Sohl-Dickstein, Eric A. Weiss, Niru Maheswaranathan, Surya Ganguli (https://arxiv.org/abs/1503.03585)
[2] Denoising Diffusion Probabilistic Models by Jonathan Ho, Ajay Jain, Pieter Abbeel (https://arxiv.org/abs/2006.11239)
[3] Score-Based Generative Modeling through Stochastic Differential Equations by Yang Song, Jascha Sohl-Dickstein, Diederik P. Kingma, Abhishek Kumar, Stefano Ermon, Ben Poole (https://arxiv.org/abs/2011.13456)
[4] Diffusion Models Beat GANs on Image Synthesis by Prafulla Dhariwal, Alex Nichol (https://arxiv.org/abs/2105.05233)
[5] Elucidating the Design Space of Diffusion-Based Generative Models by Tero Karras, Miika Aittala, Timo Aila, Samuli Laine (https://arxiv.org/abs/2206.00364)
[6] Cold Diffusion: Inverting Arbitrary Image Transforms Without Noise by Arpit Bansal, Eitan Borgnia, Hong-Min Chu, Jie S. Li, Hamid Kazemi, Furong Huang, Micah Goldblum, Jonas Geiping, Tom Goldstein (https://arxiv.org/abs/2208.09392)